Data Import

library(tidyverse)
library(readxl)
library(smacof)
library(ggplot2)

wyoming <- read_csv("~/Documents/Wyoming data import.csv")
AlaskaCVR <- read_excel("~/Documents/AlaskaCVR.xlsx", na = "under")

names<-colnames(wyoming)
names[6:10]<- c(1,2,3,4,5)
colnames(wyoming) <- names
wyoming_l <- wyoming %>%
  select(BallotID, `1` ,`2` ,`3` ,`4`, `5` ) %>%
  pivot_longer(-BallotID, names_to = 'rank', values_to = "candidate") %>%
  filter(!candidate %in% c('under','over')) %>%
  pivot_wider(names_from = candidate, values_from = rank, values_fn = first)
names<-colnames(AlaskaCVR)
names[7:11]<- c(1,2,3,4,5)
colnames(AlaskaCVR) <- names
AlaskaCVR$BallotID <- paste0(AlaskaCVR$`Tabulator Id`, AlaskaCVR$`Batch Id`, AlaskaCVR$`Record Id`)
alaska_l <- AlaskaCVR %>% 
  select(`BallotID`, `1` ,`2` ,`3` ,`4`, `5` ) %>%
  pivot_longer(-BallotID, names_to = 'rank', values_to = "candidate") %>%
  filter(!candidate %in% c('under','over', NA)) %>%
  pivot_wider(names_from = candidate, values_from = rank, values_fn = first)

# To do: Losing observations between wyoming and wyoming_l, why? 

Data Preprocessing

w_matrix <- wyoming_l[,2:10]
a_matrix <- alaska_l[,2:10]
# Impute missing values
# Current Implementation 
w_mat <- w_matrix %>%
    mutate_all(~replace(., is.na(.), 6)) %>% 
    mapply(FUN=as.numeric)
principle_components <- w_mat%>%prcomp()
w_pca <-bind_cols(wyoming_l,as_tibble(principle_components$x))

a_mat <- a_matrix %>%
    mutate_all(~replace(., is.na(.), 6)) %>%
    mapply(FUN=as.numeric)
principle_components <- a_mat%>%prcomp()
a_pca <-bind_cols(alaska_l,as_tibble(principle_components$x))

Cluster Analysis: Wyoming

w_cluster<-w_pca
library(broom)
# predict four clusters
w_kmeans <-kmeans(w_mat,centers = 4,# number of clusters
                  nstart = 100)# number of random starts

w_cluster$cluster<-w_kmeans$cluster

l_pca<-left_join(w_cluster, wyoming, by = 'BallotID' )
l_pca%>%
  group_by(cluster, `1.y`) %>%
  summarize(count=n()) %>%
  arrange(desc(count)) %>%
  spread(cluster,count)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## # A tibble: 11 x 5
##    `1.y`   `1`   `2`   `3`   `4`
##    <chr> <int> <int> <int> <int>
##  1 1      3799  2195  4105    NA
##  2 11       NA   235    17    41
##  3 12      113    11     7    17
##  4 3        14    16     8     9
##  5 4        NA    NA  1874  1806
##  6 5        NA   131    NA     8
##  7 7        48    30    28    27
##  8 8        NA   287    NA   401
##  9 9        27    74    26    26
## 10 over      5     3    NA     3
## 11 under     6     1     1     3
## Problem, because long form files have different length vs standard files, much harder to merge. 

It appears that Biden supports (1) are overwhelmingly in cluster 3, while Sanders supporters are largely in Cluster 2. Cluster 4 seems to be voters who liked both Warren but also Biden and Sanders. Cluster 1 looks to be dominated by Biden supporters who also liked some of the more eccentric also-rans like Bloomberg and Steyer.

ggplot(data = w_cluster) +
  geom_point(aes(x = PC1, y =PC2, color = factor(cluster)),alpha = .4) + 
  theme_minimal()

Clustering Alaska

a_cluster<-a_pca
library(broom)
# predict four clusters
a_kmeans <-kmeans(a_mat,centers = 4,# number of clusters
                  nstart = 100)# number of random starts

a_cluster$cluster<-a_kmeans$cluster

l_pca<-left_join(a_cluster, AlaskaCVR, by = "BallotID" )
l_pca%>%
  group_by(cluster, `1.y`) %>%
  summarize(count=n()) %>%
  arrange(desc(count)) %>%
  spread(cluster,count)
## `summarise()` regrouping output by 'cluster' (override with `.groups` argument)
## # A tibble: 10 x 5
##    `1.y`   `1`   `2`   `3`   `4`
##    <dbl> <int> <int> <int> <int>
##  1     1    13  2317  4300  3230
##  2     3    13    12    26    13
##  3     4  2915  2427    14  2406
##  4     5    15    35    77     4
##  5     7    62     5    27    30
##  6     8   110  1121   166     2
##  7     9    11    15    47    24
##  8    11    14    93   114    26
##  9    12    41     1    16     6
## 10    NA    13     5     9     8

It appears that Biden supports (1) are overwhelmingly in cluster 3, while Sanders supporters are largely in Cluster 2. Cluster 4 seems to be voters who liked both Warren but also Biden and Sanders. Cluster 1 looks to be dominated by Biden supporters who also liked some of the more eccentric also-rans like Bloomberg and Steyer.

ggplot(data = a_cluster) +
  geom_point(aes(x = PC1, y =PC2, color = factor(cluster)),alpha = .4) + 
  theme_minimal()

Are these clouds simmilar? Unclear.

Both

alaska_l$state<- 'alaska'
wyoming_l$state<- 'wyoming'
b_l<- rbind(alaska_l,wyoming_l)
b_matrix <- b_l[,2:10]

b_mat <- b_matrix %>%
    mutate_all(~replace(., is.na(.), 6)) %>%
    mapply(FUN=as.numeric)
principle_components_b <- b_mat%>%prcomp()
b_pca <-bind_cols(b_l,as_tibble(principle_components_b$x))
ggplot(data = b_pca) +
  geom_point(aes(x = PC1, y =PC2, color = state), alpha = .4) + 
  theme_minimal()

ggplot(data = b_pca) +
  geom_density(aes(x = PC1, fill = state),alpha = .4) + 
  theme_minimal()

Alaska was a much better state for Sanders then Wyoming, but Sanders still lost, suggests that the middle peak in this chart is Biden-Sanders + Sanders-Biden, and that the left peak is Biden only.

Clustering Both

b_cluster<-b_pca
# predict four clusters
b_kmeans <-kmeans(b_mat,centers = 4,# number of clusters
                  nstart = 100)# number of random starts

b_cluster$cluster<-b_kmeans$cluster
ggplot(data = b_cluster) +
  geom_point(aes(x = PC1, y =PC2, color = factor(cluster)),alpha = .4) + 
  theme_minimal()

ggplot(data = b_cluster) +
  geom_density(aes(x = PC1, fill = factor(cluster)),alpha = .4) + 
  theme_minimal()

Pretty clear that Cluster 2 is the Biden only cluster, 3 might be Biden-Sanders, 1 Sanders-Biden, 4 Sanders-Warren?

## This takes way too long, maybe a project for AWS!
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#fviz_nbclust(w_mat, FUN = kmeans, method = "wss")
# fviz_nbclust(pca_data, FUN = kmeans, method = "gap_stat") This one crashes because it runs out of memory
#fviz_nbclust(w_mat, FUN = kmeans, method = "silhouette")